A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
A
B
View(anscombe)
lm(y3~x3, data = anscombe)
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  ## or   ff[[2]] <- as.name(paste0("y", i))
  ##      ff[[3]] <- as.name(paste0("x", i))
  mods[[i]] <- lmi <- lm(ff, data = anscombe)
  print(anova(lmi))
}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
# Aggregate function
#Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma =  c(0, 0, 2, 0))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
       xlim = c(3, 19), ylim = c(3, 13))
  abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
x = c(TRUE, FALSE, FALSE)
typeof(x)  #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
dim(y)
length(y)
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)
quote(x+y)
as.list(quote(x + y))
1e3L #create constant
1L
cat(1+2)
'+'(1, 2)
x <- options()
x$prompt
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
i <- matrix(1:4, 2, byrow = TRUE)
i
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
# https://cran.r-project.org/doc/manuals/R-intro.pdf

help.start()
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
ls()
rm(list=ls())
ls()
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy 
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
levels(statef)
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
help("<-")
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
seq(0,10, length=11) == seq(0,10, by=1)
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
table(x)
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
summary(df)
names(df);cat(rownames(df))
# Merge
merge(x, y)                 # INNER JOIN
merge(x, y, all.x = TRUE)   # LEFT JOIN
merge(x, y, all.y = TRUE)   # RIGHT JOIN
merge(x, y, all   = TRUE)   # OUTER JOIN
merge(a, b, by=c("y", "z")) # specify what to merge on
Error in fix.by(by.x, x) : 'by' must specify uniquely valid columns
# Regression
data(cars) 
#View(cars)

# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)

#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
library(rpart.plot)
Loading required package: rpart
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
               data=train,
               method="class", 
               control=rpart.control(minsplit=2, cp=0))
Error in is.data.frame(data) : object 'train' not found
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
# Delete
h[["bar"]] <- NULL
m <- matrix( c(1,2,3,4), nrow=2 )
m
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
?solve
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
r
ftable(d) #easier reading
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping

by(1:20, gl(2,10,20), sum)
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
        main="Accumulated return")
# Strings
print("Hello\n")

cat("Hello\n") #use cat
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
paste(1:3, "Hello World!", sep=":")
nchar("Hello World!")
s <- "Hello World"
substring(s, 4, 6)
s <- "foo-->bar-->baz"
strsplit(s, "-->")
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
grep("O", s)
grep("O", s, value=T)
regexpr("o", "Hello")
regexpr("o", c("Hello", "World!"))
s <- "foo    bar baz"
gsub(" ", "", s)   # Remove all the spaces
gsub(" +", " ", s)  # Remove multiple spaces and replace them by single spaces
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
# Dates
as.Date("2005-05-15") #ISO 8601
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
methods(class="Date")
as.POSIXlt("2005-05-15 21:45:17")
as.POSIXct(Sys.Date())
# Reading from dataframes
# option 1
  #d <- read.table("foo.txt")
  #d$Date <- as.Date( as.character( d$Date ) )
# option 2
  #read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
options(warn=1)
methods(plot)
# Import 

# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv")  # semicolon-separated file, with a
#                            # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf")   # Fixed width fields
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try

# d <- read.table("foo.csv", header = TRUE, sep = ",", 
#                 na.strings = c("#N/A!", "NA", "@NA"), 
#                 quote = '"',
#                 comment.char = "")
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
  # x <- scan("foo.txt", sep=",")  # Gives a numeric vector
  # n <- scan("foo.txt", sep=",", nlines=1)
  # x <- matrix(x, nc=n)

# A vector of strings
  #x <- scan("foo.txt", what=character(0))
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression

  # fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
  # fm6 <- update(fm05, . ~ . + x6)
  # smf6 <- update(fm6, sqrt(.) ~ .)
# Spine (from help)
require(graphics)

op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
attributes(g2)$na.action
sum(g1)
sum(g1, na.rm = TRUE)
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
cor(m)  # returns all NAs
   x  y  z
x  1 NA NA
y NA  1 NA
z NA NA  1
cor(m, use = "complete.obs")
       x       y       z
x 1.0000 0.34819 0.70957
y 0.3482 1.00000 0.04583
z 0.7096 0.04583 1.00000
cor(m, use = "pairwise.complete.obs")
       x      y      z
x 1.0000 0.2498 0.7096
y 0.2498 1.0000 0.4534
z 0.7096 0.4534 1.0000
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)

Call:
lm(formula = y ~ x + z, data = m)

Residuals:
     2      3      5      6      7 
-0.632  1.711 -1.237 -0.447  0.605 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    3.316      3.399    0.98     0.43
x              0.289      0.408    0.71     0.55
z             -0.632      1.396   -0.45     0.70

Residual standard error: 1.65 on 2 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.203, Adjusted R-squared:  -0.594 
F-statistic: 0.254 on 2 and 2 DF,  p-value: 0.797
# Checking length of data used for regression
length(m$y)
[1] 7
length(lm$fitted)
[1] 5
# checking where missing data is
is.na(m) # only useful for small examples
         x     y     z
[1,] FALSE FALSE  TRUE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE FALSE
[4,]  TRUE FALSE FALSE
[5,] FALSE FALSE FALSE
[6,] FALSE FALSE FALSE
[7,] FALSE FALSE FALSE
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable", 
    xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)

# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
# Mean and Random imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
[1]  1  2  3 NA  5  7  9
x2
[1] 1.0 2.0 3.0 4.5 5.0 7.0 9.0
# Random imputation - conserve mean and variance. Sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
[1]  1  2  3 NA  5  7  9
x3
[1] 1 2 3 5 5 7 9
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html 
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
save(mydf, file = "saveddf.RData")
unlink("saveddf.RData")
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
structure(list(x = c(-0.626453810742332, 0.183643324222082, -0.835628612410047, 
1.59528080213779, 0.329507771815361, -0.820468384118015, 0.487429052428485, 
0.738324705129217, 0.575781351653492, -0.305388387156356), y = c(1.51178116845085, 
0.389843236411431, -0.621240580541804, -2.2146998871775, 1.12493091814311, 
-0.0449336090152309, -0.0161902630989461, 0.943836210685299, 
0.821221195098089, 0.593901321217509), z = c(0.918977371608218, 
0.782136300731067, 0.0745649833651906, -1.98935169586337, 0.61982574789471, 
-0.0561287395290008, -0.155795506705329, -1.47075238389927, -0.47815005510862, 
0.417941560199702)), .Names = c("x", "y", "z"), row.names = c(NA, 
-10L), class = "data.frame")
dput(mydf, "saveddf.txt")
mydf2 <- dget("saveddf.txt")
mydf2
mydf==mydf2
          x     y     z
 [1,] FALSE FALSE FALSE
 [2,] FALSE FALSE FALSE
 [3,] FALSE FALSE  TRUE
 [4,] FALSE  TRUE FALSE
 [5,] FALSE FALSE FALSE
 [6,] FALSE FALSE FALSE
 [7,] FALSE FALSE FALSE
 [8,] FALSE FALSE FALSE
 [9,] FALSE FALSE FALSE
[10,]  TRUE FALSE FALSE
unlink("saveddf.text")
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
# Dataframe rearrangement
set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20), 
    d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
# manual order change
head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
# using order
order(mydf$e)
 [1]  2 16 18 19 20 14  3  4  5 12  1 11 13 15  7  8 10  9  6 17
head(mydf[order(mydf$e), ])
# Subset
mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]
subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
# Splitting 
split(mydf, mydf$a)
$`1`

$`2`
NA
split(mydf, list(mydf$a, mydf$b))
$`1.1`

$`2.1`

$`1.2`

$`2.2`

$`1.3`

$`2.3`

$`1.4`

$`2.4`
lapply(split(mydf, mydf$a), summary)
$`1`
       a           b              c                d                e        
 Min.   :1   Min.   :1.00   Min.   :-1.728   Min.   :-1.590   Min.   : 1.00  
 1st Qu.:1   1st Qu.:1.25   1st Qu.:-0.779   1st Qu.:-0.359   1st Qu.: 8.25  
 Median :1   Median :2.00   Median :-0.122   Median : 0.193   Median :13.00  
 Mean   :1   Mean   :2.30   Mean   :-0.244   Mean   : 0.299   Mean   :12.10  
 3rd Qu.:1   3rd Qu.:3.00   3rd Qu.: 0.483   3rd Qu.: 0.568   3rd Qu.:16.75  
 Max.   :1   Max.   :4.00   Max.   : 0.976   Max.   : 2.668   Max.   :19.00  

$`2`
       a           b              c                d                 e        
 Min.   :2   Min.   :1.00   Min.   :-1.166   Min.   :-1.1304   Min.   : 2.00  
 1st Qu.:2   1st Qu.:2.00   1st Qu.:-0.488   1st Qu.:-0.6338   1st Qu.: 4.25  
 Median :2   Median :3.00   Median :-0.343   Median : 0.2961   Median : 8.00  
 Mean   :2   Mean   :2.70   Mean   :-0.268   Mean   : 0.0793   Mean   : 8.90  
 3rd Qu.:2   3rd Qu.:3.75   3rd Qu.: 0.108   3rd Qu.: 0.4137   3rd Qu.:12.75  
 Max.   :2   Max.   :4.00   Max.   : 0.555   Max.   : 1.8397   Max.   :20.00  
lapply(split(mydf, mydf$a), summary)
$`1`
       a           b              c                d                e        
 Min.   :1   Min.   :1.00   Min.   :-1.728   Min.   :-1.590   Min.   : 1.00  
 1st Qu.:1   1st Qu.:1.25   1st Qu.:-0.779   1st Qu.:-0.359   1st Qu.: 8.25  
 Median :1   Median :2.00   Median :-0.122   Median : 0.193   Median :13.00  
 Mean   :1   Mean   :2.30   Mean   :-0.244   Mean   : 0.299   Mean   :12.10  
 3rd Qu.:1   3rd Qu.:3.00   3rd Qu.: 0.483   3rd Qu.: 0.568   3rd Qu.:16.75  
 Max.   :1   Max.   :4.00   Max.   : 0.976   Max.   : 2.668   Max.   :19.00  

$`2`
       a           b              c                d                 e        
 Min.   :2   Min.   :1.00   Min.   :-1.166   Min.   :-1.1304   Min.   : 2.00  
 1st Qu.:2   1st Qu.:2.00   1st Qu.:-0.488   1st Qu.:-0.6338   1st Qu.: 4.25  
 Median :2   Median :3.00   Median :-0.343   Median : 0.2961   Median : 8.00  
 Mean   :2   Mean   :2.70   Mean   :-0.268   Mean   : 0.0793   Mean   : 8.90  
 3rd Qu.:2   3rd Qu.:3.75   3rd Qu.: 0.108   3rd Qu.: 0.4137   3rd Qu.:12.75  
 Max.   :2   Max.   :4.00   Max.   : 0.555   Max.   : 1.8397   Max.   :20.00  
# sampling
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
[1] 18  1 12 15  6
mydf[s,]
# test set
mydf[-s, ]
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2)
s2
 [1] 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
mydf[s2,]
mydf[-s2,]
library(car)

Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    recode
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
 [1]  1  1  1  1  1  2  2  2  2  2 NA NA NA NA NA NA NA NA NA NA
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
 [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
e <- recode(h, "NA=99")
e
 [1]  1  2  3  4  5 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
interaction(i$Var1, i$Var2) # creates all possible combinations
[1] 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
Levels: 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n, 
    1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
str(mydf)
'data.frame':   30 obs. of  6 variables:
 $ x1: int  0 0 1 1 0 1 1 1 1 0 ...
 $ x2: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x3: int  1 0 0 0 1 0 0 1 0 1 ...
 $ x4: int  1 1 1 0 1 1 1 1 0 1 ...
 $ x5: num  1 1 1 1 1 1 1 1 1 1 ...
 $ x6: num  NA 1 1 0 NA 1 1 0 0 1 ...
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
 [1] -1  0  1  1 -1  1  1  0  1 -1  0 -1  1  0  1 -1  0  1 -1  0  1 -1  1  0 -1  0 -1  0  1
[30]  0
with(mydf, x1+x2-x3)
 [1] -1  0  1  1 -1  1  1  0  1 -1  0 -1  1  0  1 -1  0  1 -1  0  1 -1  1  0 -1  0 -1  0  1
[30]  0
rowSums(mydf)
 [1] NA  3  4  2 NA  4  4  4  2  4  3  3  3  2 NA  4  5  4 NA  5 NA  4  3  2 NA  3  3 NA  3
[30] NA
rowSums(mydf, na.rm = T)
 [1] 3 3 4 2 3 4 4 4 2 4 3 3 3 2 3 4 5 4 2 5 2 4 3 2 3 3 3 2 3 2
data.frame(1:n, rowSums(mydf, na.rm = T))
rowMeans(mydf)
 [1]     NA 0.5000 0.6667 0.3333     NA 0.6667 0.6667 0.6667 0.3333 0.6667 0.5000 0.5000
[13] 0.5000 0.3333     NA 0.6667 0.8333 0.6667     NA 0.8333     NA 0.6667 0.5000 0.3333
[25]     NA 0.5000 0.5000     NA 0.5000     NA
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2)  rows & columns.
 [1]     NA 0.3000 0.2667 0.2667     NA 0.2667 0.2667 0.2667 0.2667 0.2667 0.3000 0.3000
[13] 0.3000 0.2667     NA 0.2667 0.1667 0.2667     NA 0.1667     NA 0.2667 0.3000 0.2667
[25]     NA 0.3000 0.3000     NA 0.3000     NA
apply(mydf, 2, var)
    x1     x2     x3     x4     x5     x6 
0.2575 0.0000 0.2483 0.1437 0.0000     NA 
sapply(mydf, var) # over list or vector
    x1     x2     x3     x4     x5     x6 
0.2575 0.0000 0.2483 0.1437 0.0000     NA 
newvar
 [1] 3 2 0 0 3 0 0 1 0 3 2 3 0 1 0 3 1 0 2 1 0 3 0 2 3 2 3 2 0 2
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3)
number of items to replace is not a multiple of replacement length
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
  2.5%  97.5% 
-1.671  1.797 
quantile(a, seq(0, 1, by = 0.1))
     0%     10%     20%     30%     40%     50%     60%     70%     80%     90%    100% 
-2.2147 -1.0527 -0.6139 -0.3753 -0.0767  0.1139  0.3771  0.5812  0.7713  1.1811  2.4016 
summary(as.logical(rbinom(1000, 1, 0.5)))
   Mode   FALSE    TRUE 
logical     492     508 
summary(factor(a))
   -2.2146998871775   -1.98935169586337   -1.80495862889104   -1.52356680042976 
                  1                   1                   1                   1 
  -1.47075238389927   -1.37705955682861   -1.27659220845804    -1.2536334002391 
                  1                   1                   1                   1 
  -1.22461261489836   -1.12936309608079   -1.04413462631653  -0.934097631644252 
                  1                   1                   1                   1 
 -0.835628612410047  -0.820468384118015  -0.743273208882405  -0.709946430921815 
                  1                   1                   1                   1 
  -0.70749515696212   -0.68875569454952  -0.626453810742332  -0.621240580541804 
                  1                   1                   1                   1 
 -0.612026393250771  -0.589520946188072  -0.573265414236886  -0.568668732818502 
                  1                   1                   1                   1 
  -0.54252003099165   -0.47815005510862  -0.473400636439312  -0.443291873218433 
                  1                   1                   1                   1 
  -0.41499456329968  -0.394289953710349  -0.367221476466509  -0.305388387156356 
                  1                   1                   1                   1 
 -0.304183923634301  -0.253361680136508  -0.164523596253587  -0.155795506705329 
                  1                   1                   1                   1 
 -0.135178615123832  -0.135054603880824  -0.112346212150228  -0.102787727342996 
                  1                   1                   1                   1 
-0.0593133967111857 -0.0561287395290008 -0.0538050405829051 -0.0449336090152309 
                  1                   1                   1                   1 
-0.0392400027331692 -0.0161902630989461 0.00110535163162413  0.0280021587806661 
                  1                   1                   1                   1 
 0.0743413241516641  0.0745649833651906   0.153253338211898   0.183643324222082 
                  1                   1                   1                   1 
  0.188792299514343   0.267098790772231   0.291446235517463   0.329507771815361 
                  1                   1                   1                   1 
  0.332950371213518   0.341119691424425    0.36458196213683   0.370018809916288 
                  1                   1                   1                   1 
  0.387671611559369   0.389843236411431   0.398105880367068   0.417941560199702 
                  1                   1                   1                   1 
  0.475509528899663   0.487429052428485   0.556663198673657   0.558486425565304 
                  1                   1                   1                   1 
  0.569719627442413   0.575781351653492   0.593901321217509   0.593946187628422 
                  1                   1                   1                   1 
  0.610726353489055    0.61982574789471   0.689739362450777   0.696963375404737 
                  1                   1                   1                   1 
  0.700213649514998   0.738324705129217   0.763175748457544   0.768532924515416 
                  1                   1                   1                   1 
  0.782136300731067   0.821221195098089   0.881107726454215   0.918977371608218 
                  1                   1                   1                   1 
  0.943836210685299    1.06309983727636    1.10002537198388    1.12493091814311 
                  1                   1                   1                   1 
   1.16040261569495     1.1780869965732    1.20786780598317    1.35867955152904 
                  1                   1                   1                   1 
   1.43302370170104    1.46555486156289    1.51178116845085    1.58683345454085 
                  1                   1                   1                   1 
   1.59528080213779    1.98039989850586    2.17261167036215    2.40161776050478 
                  1                   1                   1                   1 
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
 [1] 2 2 3 5 2 5 5 4 4 1 2 1 4 2 4 3 4 5 2 4 5 2 4 1 2
table(a)
a
1 2 3 4 5 
3 8 2 7 5 
prop.table(table(a)) # to obtain percentages
a
   1    2    3    4    5 
0.12 0.32 0.08 0.28 0.20 
prop.table(table(a)) *100
a
 1  2  3  4  5 
12 32  8 28 20 
cbind(table(a), prop.table(table(a))*100)
  [,1] [,2]
1    3   12
2    8   32
3    2    8
4    7   28
5    5   20
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
   b
a   1 2
  1 0 3
  2 5 3
  3 1 1
  4 5 2
  5 2 3
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
, , c = 3

   b
a   1 2
  1 0 1
  2 3 1
  3 0 1
  4 1 0
  5 1 1

, , c = 4

   b
a   1 2
  1 0 0
  2 2 2
  3 0 0
  4 2 2
  5 0 0

, , c = 5

   b
a   1 2
  1 0 2
  2 0 0
  3 1 0
  4 2 0
  5 1 2
ftable(a, c, c)
    c 3 4 5
a c        
1 3   1 0 0
  4   0 0 0
  5   0 0 2
2 3   4 0 0
  4   0 4 0
  5   0 0 0
3 3   1 0 0
  4   0 0 0
  5   0 0 1
4 3   1 0 0
  4   0 4 0
  5   0 0 2
5 3   2 0 0
  4   0 0 0
  5   0 0 3
xtabs(~a + b)
   b
a   1 2
  1 0 3
  2 5 3
  3 1 1
  4 5 2
  5 2 3
x <- table(a, b)
addmargins(x)
     b
a      1  2 Sum
  1    0  3   3
  2    5  3   8
  3    1  1   2
  4    5  2   7
  5    2  3   5
  Sum 13 12  25
prop.table(table(a, b), 1) # proportions by rows
   b
a           1         2
  1 0.0000000 1.0000000
  2 0.6250000 0.3750000
  3 0.5000000 0.5000000
  4 0.7142857 0.2857143
  5 0.4000000 0.6000000
prop.table(table(a, b), 2)
   b
a            1          2
  1 0.00000000 0.25000000
  2 0.38461538 0.25000000
  3 0.07692308 0.08333333
  4 0.38461538 0.16666667
  5 0.15384615 0.25000000
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
cor(x1, x2)
[1] 0.006401211
cor.test(x1, x2)

    Pearson's product-moment correlation

data:  x1 and x2
t = 0.20223, df = 998, p-value = 0.8398
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.05561394  0.06836716
sample estimates:
        cor 
0.006401211 
cor(cbind(x1, x2, y))
            x1          x2          y
x1 1.000000000 0.006401211 0.99837564
x2 0.006401211 1.000000000 0.04731214
y  0.998375645 0.047312138 1.00000000
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)

plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)

cor(a, b)
[1] -0.01712362
cor(a^2, b)
[1] 0.8430192
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")

layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)

# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
[1] 170.1667
signif(mean(height), 4)
[1] 170.2
round(mean(height), 1)
[1] 170.2
round(mean(height), -2)
[1] 200
options(digits = 5)
sd(height)
[1] 8.8863
options(digits = 2)
sd(height)
[1] 8.9
options(scipen = -10)
10000000
[1] 1e+07
# sprintf
sprintf("%.3f", pi)
[1] "3.142"
sprintf("%05.1f", pi)
[1] "003.1"
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")

density(a)

Call:
    density.default(x = a)

Data: a (30 obs.);  Bandwidth 'bw' = 0.3891

       x                  y          
 Min.   :-3.4e+00   Min.   :0.0e+00  
 1st Qu.:-1.8e+00   1st Qu.:3.0e-02  
 Median :-3.0e-01   Median :9.0e-02  
 Mean   :-3.0e-01   Mean   :1.6e-01  
 3rd Qu.: 1.2e+00   3rd Qu.:2.9e-01  
 Max.   : 2.8e+00   Max.   :4.6e-01  
plot(density(a))

hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)

b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)

d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
      [,1]  [,2]  [,3]
[1,] 2e+00 4e+00 1e+00
[2,] 6e+00 1e+00 3e+00
barplot(d, names.arg = letters[1:3])

barplot(d, beside = T)

layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))

boxplot(a)

e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)

g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)

# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
plot(x1, x2)

plot(x2~x1)

layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)

pairs(~x1 + x2 + x3 + x4)

colors()[1:10]
 [1] "white"         "aliceblue"     "antiquewhite"  "antiquewhite1" "antiquewhite2"
 [6] "antiquewhite3" "antiquewhite4" "aquamarine"    "aquamarine1"   "aquamarine2"  
length(colors())
[1] 657
colors()[600]
[1] "slategray1"
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))

c("red", "blue", "green", "orange")[z]
  [1] "blue"   "blue"   "green"  "red"    "blue"   "blue"   "orange" "blue"   "green" 
 [10] "red"    "green"  "orange" "blue"   "blue"   "orange" "green"  "red"    "blue"  
 [19] "blue"   "green"  "green"  "green"  "green"  "green"  "blue"   "red"    "orange"
 [28] "orange" "green"  "blue"   "blue"   "orange" "blue"   "orange" "green"  "orange"
 [37] "red"    "green"  "orange" "red"    "blue"   "orange" "orange" "orange" "green" 
 [46] "blue"   "orange" "orange" "red"    "blue"   "blue"   "red"    "red"    "blue"  
 [55] "green"  "blue"   "red"    "red"    "green"  "red"    "blue"   "green"  "orange"
 [64] "green"  "blue"   "blue"   "blue"   "blue"   "red"    "green"  "blue"   "blue"  
 [73] "green"  "orange" "green"  "green"  "orange" "orange" "orange" "red"    "blue"  
 [82] "green"  "orange" "orange" "red"    "green"  "green"  "red"    "blue"   "green" 
 [91] "orange" "red"    "blue"   "blue"   "orange" "blue"   "green"  "red"    "red"   
[100] "orange"
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups

# Analysis of variance (ANOVA)
set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
aov(y~tr)
Call:
   aov(formula = y ~ tr)

Terms:
                     tr Residuals
Sum of Squares  2.5e+02   1.2e+03
Deg. of Freedom 1.0e+00   1.2e+02

Residual standard error: 3.2e+00
Estimated effects may be unbalanced
summary(aov(y ~ factor(tr)))
                  Df   Sum Sq Mean Sq  F value  Pr(>F)    
factor(tr)  3.00e+00 2.97e+02 9.9e+01 9.98e+00 6.6e-06 ***
Residuals   1.16e+02 1.15e+03 9.9e+00                     
---
Signif. codes:  0e+00 ‘***’ 1e-03 ‘**’ 1e-02 ‘*’ 5e-02 ‘.’ 1e-01 ‘ ’ 1e+00
oneway.test(y ~ tr)

    One-way analysis of means (not assuming equal variances)

data:  y and tr
F = 4e+01, num df = 3e+00, denom df = 5e+01, p-value = 1e-13
oneway.test(y ~ factor(tr), var.equal = TRUE)

    One-way analysis of means

data:  y and factor(tr)
F = 1e+01, num df = 3e+00, denom df = 1e+02, p-value = 7e-06
by(y, tr, FUN = mean)
tr: 1
[1] 5e+00
------------------------------------------------------------------------------------ 
tr: 2
[1] 4.2e+00
------------------------------------------------------------------------------------ 
tr: 3
[1] 3.8e+00
------------------------------------------------------------------------------------ 
tr: 4
[1] 8.5e-01
tapply(y, tr, FUN = mean) # same thing
      1       2       3       4 
5.0e+00 4.2e+00 3.8e+00 8.5e-01 
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0)  # density
[1] 0.39894
dnorm(0, mean=-1)
[1] 0.24197
pnorm(0)  # cumulative
[1] 0.5
pnorm(1.65) # 95% normal confidence interval
[1] 0.95053
pnorm(1.96)
[1] 0.975
pnorm(1.96) - pnorm(-1.96)
[1] 0.95
qnorm(c(0.025, 0.975))  # quantile
[1] -1.96  1.96
pnorm(qnorm(c(0.025, 0.975)))
[1] 0.025 0.975
# other distribution
dbinom(0, 1, 0.5)
[1] 0.5
pbinom(0, 1, 0.5)
[1] 0.5
qbinom(.95, 1, 0.5)
[1] 1
# Formulae
myformula <- ~x
class(myformula)
[1] "formula"
# interactions
y ~ x1 * x2
y ~ x1 * x2
# As strings
("y ~ x") == (y ~ x)
[1] TRUE
as.formula("y~x")
y ~ x
as.character(y ~ x)
[1] "~" "y" "x"
terms(y ~ x1 + x2)
y ~ x1 + x2
attr(,"variables")
list(y, x1, x2)
attr(,"factors")
   x1 x2
y   0  0
x1  1  0
x2  0  1
attr(,"term.labels")
[1] "x1" "x2"
attr(,"order")
[1] 1 1
attr(,"intercept")
[1] 1
attr(,"response")
[1] 1
attr(,".Environment")
<environment: R_GlobalEnv>
update(y ~ x, ~. + x2)
y ~ x + x2
update(y ~ x, z ~ .)
z ~ x
# Bivariate Regression
set.seed(1)
bin <- rbinom(1000, 1, 0.5)
out <- 2 * bin + rnorm(1000)
by(out, bin, mean)
bin: 0
[1] -0.015881
--------------------------------------------------------------------- 
bin: 1
[1] 1.9662
t.test(out ~ bin)

    Welch Two Sample t-test

data:  out by bin
t = -30.3, df = 993, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -2.1105 -1.8537
sample estimates:
mean in group 0 mean in group 1 
      -0.015881        1.966237 
lm(out ~ bin)

Call:
lm(formula = out ~ bin)

Coefficients:
(Intercept)          bin  
    -0.0159       1.9821  
summary(lm(out~bin))$coef
             Estimate Std. Error  t value    Pr(>|t|)
(Intercept) -0.015881   0.045341 -0.35027  7.2621e-01
bin          1.982119   0.065444 30.28724 1.9487e-143
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")

set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x)
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)

m1 <- glm(y ~ x, family = binomial(link = "logit"))
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit)  # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit)  # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
newdf[1:10,c(1,2,3,5,4)]
Error: object 'newdf' not found
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))

head(mpg)
#g <- ggplot(data = mpg, aes(x = displ, y = hwy))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)

ggplot(data = mpg, aes(x = displ, y = hwy)) +  geom_point() + geom_smooth()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE)  # Turn off confidence band

ggplot(data = mpg, aes(x = class, y = hwy)) + geom_boxplot() # scatterplot

ggplot(data = mpg, aes(x = reorder(class, hwy), y = hwy)) + geom_boxplot() # reorder (mean)

ggplot(data = mpg, aes(x = reorder(class, hwy, median), y = hwy)) + geom_boxplot() # reorder by median

# jitter
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()

ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point(position = "jitter")

ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_jitter() # identical option

names(diamonds) # part of ggplot2 package
 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"   "x"       "y"       "z"      
ggplot(data = diamonds,aes(x =cut)) + geom_bar(aes(fill =cut)) # fill: color inside of bars

ggplot(data = diamonds, aes(x =cut)) + geom_bar(aes(color =cut)) # color: line around the bars

ggplot(data = diamonds, aes(x = color)) + geom_bar(aes(fill = cut), position = "dodge")

str(diamonds)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   53940 obs. of  10 variables:
 $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
 $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
 $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
 $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
 $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
 $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
 $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
 $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
 $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
 $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 1)

ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 0.01)

ggplot(data = diamonds, aes(x = carat)) + geom_histogram() #stat_bin: binwidth defaulted to range/30.

zoom <- coord_cartesian(xlim = c(55, 70))
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(binwidth = 0.2) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_histogram(aes(fill = cut), binwidth = 0.1) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(fill = cut)) + zoom

ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(color = cut, fill = cut, alpha=0.1)) + zoom

ggplot(data = diamonds, aes(x = price)) +geom_histogram(aes(fill = cut), binwidth = 100)

ggplot(data = diamonds, aes(x = price)) + geom_density(aes(color= cut))

# Visualization of big data
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(aes(color = cut)) # not helpful

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_bin2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_density2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point() + geom_density2d()

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(group = cut))
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(color = cut), method = "loess", se=F)

# ggsave("my-plot.pdf") # PDF more crisp
# ggsave("my-plot.png")
# ggsave("my-plot.pdf", width = 6, height = 6)
# ggsave("my-plot.png", width = 6, height = 6)
head(births, 3)

head(bnames, 3)
# Simple plot
Quentin <- bnames[bnames$name == "Letitia", ] 
qplot(year, prop, data = Quentin, geom = "line")

# Interactions
michaels <- bnames[bnames$name == "Quentin" | bnames$name == "Alexis" | bnames$name == "Gina", ]
qplot(year, prop, data = michaels, geom = "line", color = interaction(sex, name))

# dplyr
library(dplyr)
bnames = tbl_df(bnames) 
births = tbl_df(births) 
class(bnames)
[1] "tbl_df"     "tbl"        "data.frame"
# filter
vivian = filter(bnames, name == "Vivian")
filter(bnames, sex == "girl" & (year == 1900 | year == 2000))
# Select columns
select(bnames, starts_with("sound"))
# Rename column
rename(iris, petal_length = Petal.Length)
# sort
arrange(bnames, desc(prop))[3,]
# year where Quentin was most popular
arrange(filter(bnames, name == "Quentin"), desc(prop))[1,]
# add columns
mutate(births, double = 2 * births)
# transmute to delete old columns
# Summarize
summarise(vivian,min = min(prop),mean = mean(prop), max = max(prop), number = n(), number_distinct = n_distinct(prop))
bnames2 = left_join(bnames, births, by = c("year","sex"))
bnames2 = mutate(bnames2, n = round(prop * births))
# Group
by_name = group_by(bnames2, name)
by_name
totals = summarise(by_name, total = sum(n)) 
totals
bnames2
name_sex = group_by(bnames2, name, year) 
totals2 = summarise(name_sex, total = sum(n)) 
head(totals2)
arrange(bnames2, name)[1:3,]
ungroup(by_name)
# other example
year_sex = group_by(bnames2, year, sex) 
ytotals = summarise(year_sex, births = sum(n)) 
ytotals
# ISLR 
# Chap 6 Ridge regression / Lasso
# Ex 8
set.seed(1)
X = rnorm(100)
eps = rnorm(100)
beta0 = 3
beta1 = 2
beta2 = -3
beta3 = 0.3
Y = beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + eps
# Use regsubsets to select best model having polynomial of XX of degree 10
library(leaps)
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)
# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
[1] 3
which.min(mod.summary$bic)
[1] 3
which.max(mod.summary$adjr2)
[1] 3
# Plot cp, BIC and adjr2
plot(mod.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(3, mod.summary$cp[3], pch = 4, col = "red", lwd = 7)

plot(mod.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(3, mod.summary$bic[3], pch = 4, col = "red", lwd = 7)

plot(mod.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, 
    type = "l")
points(3, mod.summary$adjr2[3], pch = 4, col = "red", lwd = 7)

# Coefs found by regression (replaces x^3 by x^7)
coefficients(mod.full, id = 3)
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)7 
           3.07627412            2.35623596           -3.16514887            0.01046843 
# Now lasso
library(glmnet)
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1] # remove intercept (first column)
mod.lasso = cv.glmnet(xmat, Y, alpha = 1) # cv.glmnet: cross validation to select best lambda
best.lambda = mod.lasso$lambda.min
best.lambda
[1] 0.03991416
plot(mod.lasso)

# Next fit the model on entire data using best lambda
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
11 x 1 sparse Matrix of class "dgCMatrix"
                                   1
(Intercept)             3.0398151056
poly(x, 10, raw = T)1   2.2303371338
poly(x, 10, raw = T)2  -3.1033192679
poly(x, 10, raw = T)3   .           
poly(x, 10, raw = T)4   .           
poly(x, 10, raw = T)5   0.0498410763
poly(x, 10, raw = T)6   .           
poly(x, 10, raw = T)7   0.0008068431
poly(x, 10, raw = T)8   .           
poly(x, 10, raw = T)9   .           
poly(x, 10, raw = T)10  .           
#Lasso also picks X^5 over X^3. It also picks X^7 with negligible coefficient.
# ISLR 
# Chap 7 Non-linear Modeling - Splines, GAM
library(ISLR)
attach(Wage)
The following objects are masked from Wage (pos = 3):

    age, education, health, health_ins, jobclass, logwage, maritl, race, region, wage, year
fit=lm(wage~poly(age,4,raw=T),data=Wage)  # raw=T to obtain coefficients of poly directly
coef(summary(fit)) # below we see small coef (and p-value) for order 3 & 4
                            Estimate   Std. Error   t value     Pr(>|t|)
(Intercept)            -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
poly(age, 4, raw = T)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
poly(age, 4, raw = T)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
# Equivalent expressions
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
  (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
-1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05 
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage) # cbind in formulas <-> I() wrapper
coef(fit2b)
                       (Intercept) cbind(age, age^2, age^3, age^4)age    cbind(age, age^2, age^3, age^4) 
                     -1.841542e+02                       2.124552e+01                      -5.638593e-01 
   cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4) 
                      6.810688e-03                      -3.203830e-05 
# Prediction
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE) # creates list of ages where we want prediction (see below)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0)) # mar/ oma: margins of plot
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)

age.grid=seq(from=agelims[1],to=agelims[2]) 
list(age=age.grid) # $age list
$age
 [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
[34] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
predict(fit,list(age=c(70))) # test for 1 age - still need to create an $age list
       1 
106.9478 
preds
$fit
        1         2         3         4         5         6         7         8         9        10 
 51.93145  58.49674  64.57188  70.18273  75.35440  80.11122  84.47676  88.47380  92.12437  95.44973 
       11        12        13        14        15        16        17        18        19        20 
 98.47038 101.20601 103.67560 105.89731 107.88856 109.66599 111.24548 112.64214 113.87029 114.94351 
       21        22        23        24        25        26        27        28        29        30 
115.87459 116.67557 117.35770 117.93148 118.40662 118.79210 119.09608 119.32598 119.48846 119.58939 
       31        32        33        34        35        36        37        38        39        40 
119.63388 119.62627 119.57013 119.46827 119.32271 119.13473 118.90481 118.63269 118.31733 117.95691 
       41        42        43        44        45        46        47        48        49        50 
117.54885 117.08980 116.57566 116.00152 115.36174 114.64988 113.85877 112.98043 112.00613 110.92638 
       51        52        53        54        55        56        57        58        59        60 
109.73090 108.40865 106.94784 105.33588 103.55943 101.60437  99.45583  97.09815  94.51491  91.68893 
       61        62        63 
 88.60223  85.23611  81.57105 

$se.fit
        1         2         3         4         5         6         7         8         9        10 
 5.298268  4.370763  3.592101  2.955813  2.455588  2.083260  1.825676  1.662329  1.566864  1.512533 
       11        12        13        14        15        16        17        18        19        20 
 1.477572  1.447355  1.413769  1.373549  1.326690  1.275230  1.222382  1.171865  1.127324  1.091786 
       21        22        23        24        25        26        27        28        29        30 
 1.067171  1.053981  1.051263  1.056899  1.068100  1.081939  1.095809  1.107729  1.116537  1.121992 
       31        32        33        34        35        36        37        38        39        40 
 1.124827  1.126747  1.130363  1.139015  1.156439  1.186260  1.231415  1.293640  1.373218  1.469069 
       41        42        43        44        45        46        47        48        49        50 
 1.579082  1.700568  1.830715  1.966988  2.107496  2.251362  2.399124  2.553187  2.718307  2.902040 
       51        52        53        54        55        56        57        58        59        60 
 3.115018  3.370894  3.685814  4.077417  4.563601  5.161438  5.886530  6.752911  7.773327  8.959688 
       61        62        63 
10.323512 11.876272 13.629642 

$df
[1] 2995

$residual.scale
[1] 39.91479
# Use of ANOVA
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) # Analysis of variance
Analysis of Variance Table

Model 1: wage ~ age
Model 2: wage ~ poly(age, 2)
Model 3: wage ~ poly(age, 3)
Model 4: wage ~ poly(age, 4)
Model 5: wage ~ poly(age, 5)
  Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
1   2998 5022216                                    
2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
3   2996 4777674  1     15756   9.8888  0.001679 ** 
4   2995 4771604  1      6070   3.8098  0.051046 .  
5   2994 4770322  1      1283   0.8050  0.369682    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Or obtain these p-values by exploiting the fact that poly() creates orthogonal polynomials:
coef(summary(fit.5)) # and check P-values for coef of each order
                Estimate Std. Error     t value     Pr(>|t|)
(Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
# Generalized linear function
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial) # plus create binary response on the fly with I()
preds=predict(fit,newdata=list(age=age.grid),se=T) # default: type="link" = predictions for logit
# other option type="response" (probability vs odds)
# Step function
table(cut(age,4))

(17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
        750        1399         779          72 
fit=lm(wage~cut(age,4),data=Wage) # cut() returns an ordered categorical variable. Breaks= can be used 
coef(summary(fit))
                        Estimate Std. Error   t value     Pr(>|t|)
(Intercept)            94.158392   1.476069 63.789970 0.000000e+00
cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
Analysis of Variance Table

Model 1: wage ~ education + age
Model 2: wage ~ education + poly(age, 2)
Model 3: wage ~ education + poly(age, 3)
  Res.Df     RSS Df Sum of Sq        F Pr(>F)    
1   2994 3867992                                 
2   2993 3725395  1    142597 114.6969 <2e-16 ***
3   2992 3719809  1      5587   4.4936 0.0341 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Splines
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage) # 3 knots --> 7 degrees of freedom --> 1 intercept + 6 basis functions
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")

pred=predict(fit,se=T)
plot(age,wage,col="gray")
lines(age,pred$fit,lwd=2) # too many points: that's why we changed age into a list of all unique ages

# That works too - just need to have same length of data in predict() and lines()
age.grid2=sort(unique(age))
pred2=predict(fit,newdata=list(age=age.grid2),se=T)
plot(age,wage,col="gray")
lines(age.grid2,pred2$fit,lwd=2)

str(pred2)
List of 4
 $ fit           : Named num [1:61] 60.5 62.7 65.8 69.6 73.8 ...
  ..- attr(*, "names")= chr [1:61] "1" "2" "3" "4" ...
 $ se.fit        : Named num [1:61] 9.46 5.63 3.78 3.33 3.22 ...
  ..- attr(*, "names")= chr [1:61] "1" "2" "3" "4" ...
 $ df            : int 2993
 $ residual.scale: num 39.9
dim(bs(age,knots=c(25,40,60), degree = 3)) # bs generates matrix with 6 basis functions (see below)
[1] 3000    6
dim(bs(age,df=6)) # df option to produce a spline with knots at uniform quantiles of the data
[1] 3000    6
attr(bs(age,df=6),"knots")
  25%   50%   75% 
33.75 42.00 51.00 
# Natural spline: ns()
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=data.frame(age=age.grid),se=T)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
lines(age.grid, pred2$fit,col="red",lwd=2)

# Smooth spline
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16) # forces 16 degrees of freedom
fit2=smooth.spline(age,wage,cv=TRUE) # choose value of lambda by cross validation --> 6.8 degrees of freedom
cross-validation with non-unique 'x' values seems doubtful
fit2$df
[1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

# Local regresssion (Loess)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage) # span=0.2 means use 20% of observations
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

---
title: "R Notebook"
output:
  html_document: default
  html_notebook: default
---

```{r}
A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
```

```{r}
A
B
```

```{r}
View(anscombe)
lm(y3~x3, data = anscombe)
```

```{r}
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  ## or   ff[[2]] <- as.name(paste0("y", i))
  ##      ff[[3]] <- as.name(paste0("x", i))
  mods[[i]] <- lmi <- lm(ff, data = anscombe)
  print(anova(lmi))
}
```

```{r}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
```

```{r}
# Aggregate function
#Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")

# or from Titanic Kagg
#aggregate(Survived ~ Fare2 + Pclass + Sex, data=train, FUN=function(x) {sum(x)/length(x)})
```

```{r}
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma =  c(0, 0, 2, 0))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
       xlim = c(3, 19), ylim = c(3, 13))
  abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
```

```{r}
x = c(TRUE, FALSE, FALSE)
typeof(x)  #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
```

```{r}
dim(y)
length(y)
```

```{r}
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)

```

```{r}
quote(x+y)
as.list(quote(x + y))
```

```{r}
1e3L #create constant
1L
```

```{r}
cat(1+2)
'+'(1, 2)
```

```{r}
x <- options()
x$prompt
```

```{r}
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
```

```{r}
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
```

```{r}
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
```

```{r}
i <- matrix(1:4, 2, byrow = TRUE)
i
```

```{r}
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
```

```{r}
# https://cran.r-project.org/doc/manuals/R-intro.pdf

help.start()
```

```{r}
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
```

```{r}
ls()
```

```{r}
rm(list=ls())
ls()
```

```{r}
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy 
```

```{r}
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
```

```{r}
levels(statef)
```

```{r}
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
```

```{r}
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
```

```{r}
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
```

```{r}
help("<-")
```

```{r}
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
```

```{r}
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
```

```{r}
seq(0,10, length=11) == seq(0,10, by=1)
```

```{r}
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
```

```{r}
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
```

```{r}
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
```

```{r}
table(x)
```

```{r}
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
```

```{r}
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
```

```{r}
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
```

```{r}
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
```

```{r}
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
```

```{r}
summary(df)
```

```{r}
names(df);cat(rownames(df))
```

```{r}
# Merge
merge(x, y)                 # INNER JOIN
merge(x, y, all.x = TRUE)   # LEFT JOIN
merge(x, y, all.y = TRUE)   # RIGHT JOIN
merge(x, y, all   = TRUE)   # OUTER JOIN
#merge(a, b, by=c("y", "z")) # specify what to merge on
```

```{r}
# Regression
data(cars) 
#View(cars)

# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)

#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
```

```{r}
# Trees
# install.packages('rattle')
# install.packages('rpart.plot')
# install.packages('RColorBrewer')
library(rattle)
library(rpart.plot)
library(RColorBrewer)
```

```{r}
# http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/
# fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
#                data=train,
#                method="class", 
#                control=rpart.control(minsplit=2, cp=0))
# fancyRpartPlot(fit)

# Predict a continuous variable (method anova)
# Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
#                   data=combi[!is.na(combi$Age),], 
#                   method="anova")
```

```{r}
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
```

```{r}
# Delete
h[["bar"]] <- NULL
```

```{r}
m <- matrix( c(1,2,3,4), nrow=2 )
m
```

```{r}
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
```

```{r}
?solve
```

```{r}
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
```

```{r}
r
```

```{r}
ftable(d) #easier reading
```

```{r}
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
```

```{r}
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
```

```{r}
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping

by(1:20, gl(2,10,20), sum)
```

```{r}
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list

```

```{r}
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)


```

```{r}
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
        main="Accumulated return")
```

```{r}
# Strings
print("Hello\n")

cat("Hello\n") #use cat
```

```{r}
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
```

```{r}
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
```

```{r}
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
```

```{r}
paste(1:3, "Hello World!", sep=":")
```

```{r}
nchar("Hello World!")
```

```{r}
s <- "Hello World"
substring(s, 4, 6)
```

```{r}
s <- "foo-->bar-->baz"
strsplit(s, "-->")
```

```{r}
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
```

```{r}
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
```

```{r}
grep("O", s)
grep("O", s, value=T)
```

```{r}
regexpr("o", "Hello")
```

```{r}
regexpr("o", c("Hello", "World!"))
```

```{r}
s <- "foo    bar baz"
gsub(" ", "", s)   # Remove all the spaces
gsub(" +", " ", s)  # Remove multiple spaces and replace them by single spaces

```

```{r}
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
```

```{r}
# Dates
as.Date("2005-05-15") #ISO 8601
```

```{r}
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")

```

```{r}
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
```

```{r}
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
```

```{r}
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
```

```{r}
methods(class="Date")
```

```{r}
as.POSIXlt("2005-05-15 21:45:17")
```

```{r}
as.POSIXct(Sys.Date())
```

```{r}
# Reading from dataframes
# option 1
  #d <- read.table("foo.txt")
  #d$Date <- as.Date( as.character( d$Date ) )
# option 2
  #read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
```

```{r}

```

```{r}
options(warn=1)
```

```{r}
methods(plot)
```

```{r}
# Import 

# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv")  # semicolon-separated file, with a
#                            # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf")   # Fixed width fields
```

```{r}
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try

# d <- read.table("foo.csv", header = TRUE, sep = ",", 
#                 na.strings = c("#N/A!", "NA", "@NA"), 
#                 quote = '"',
#                 comment.char = "")
```

```{r}
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
  # x <- scan("foo.txt", sep=",")  # Gives a numeric vector
  # n <- scan("foo.txt", sep=",", nlines=1)
  # x <- matrix(x, nc=n)

# A vector of strings
  #x <- scan("foo.txt", what=character(0))

```

```{r}
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression

  # fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
  # fm6 <- update(fm05, . ~ . + x6)
  # smf6 <- update(fm6, sqrt(.) ~ .)
```

```{r}
# Spine (from help)
require(graphics)

op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
```

```{r}
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
```

```{r}
attributes(g2)$na.action
```

```{r}
sum(g1)
sum(g1, na.rm = TRUE)
```

```{r}
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
```

```{r}
cor(m)  # returns all NAs
```

```{r}
cor(m, use = "complete.obs") # Default, all records with NA removed
```

```{r}
cor(m, use = "pairwise.complete.obs") #kept more records for y~x and y~z
```

```{r}
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)
```

```{r}
# Checking length of data used for regression
length(m$y)
length(lm$fitted)
```

```{r}
# checking where missing data is
is.na(m) # only useful for small examples
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable", 
    xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)
```

```{r}
# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
```

```{r}
# Mean imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
x2
```

```{r}
# Random imputation - conserve mean and variance. 
# How: sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
x3
```

```{r}
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html 
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
```

```{r}
save(mydf, file = "saveddf.RData") # can be loaded by double-clicking on saved file
```

```{r}
unlink("saveddf.RData") # removing file
```

```{r}
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
```

```{r}
dput(mydf, "saveddf.txt")
```

```{r}
mydf2 <- dget("saveddf.txt")
mydf2
```

```{r}
mydf==mydf2 # due to rounding
```

```{r}
unlink("saveddf.text")
```

```{r}
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
```

```{r}
# Dataframe rearrangement

set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20), 
    d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
```

```{r}
# manual order change

head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
```

```{r}
# using order
order(mydf$e)
head(mydf[order(mydf$e), ]) # ordering on any column
```

```{r}
# Subset

mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]

subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
subset(mydf, a == 1 & b > 2, select = c("c", "d")) # filter rows & columns

```

```{r}
# Splitting 

split(mydf, mydf$a) # -> splitting according to values of a
```

```{r}
split(mydf, list(mydf$a, mydf$b))
```

```{r}
lapply(split(mydf, mydf$a), summary) # perform summary on each of the dataframes
```

```{r}
# sampling: splitting into training and test set
# Option 1
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
```

```{r}
# use 5 rows as training set
mydf[s,]
```

```{r}
# test set
mydf[-s, ]
```

```{r}
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2) #won't necessarily give exactly 5 rows
s2
```

```{r}
mydf[s2,]
mydf[-s2,]
```

```{r}
# Recoding vectors
library(car)
```

```{r}
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
```

```{r}
e <- recode(h, "NA=99")
e
```

```{r}
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
```

```{r}
interaction(i$Var1, i$Var2) # creates all possible combinations
```

```{r}
# Scaling
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n, 
    1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
```

```{r}
str(mydf)
```

```{r}
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
with(mydf, x1+x2-x3) # with to indicate dataframe

```

```{r}
rowSums(mydf)
rowSums(mydf, na.rm = T)
data.frame(1:n, rowSums(mydf, na.rm = T))
```

```{r}
rowMeans(mydf)
```

```{r}
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2)  rows & columns.
apply(mydf, 2, var)
sapply(mydf, var) # over list or vector
```

```{r}
# adding a variable
newvar <- numeric(nrow(mydf))
newvar[mydf$x1 == 1] <- with(mydf[mydf$x1 == 1, ], x2 + x3)
newvar[mydf$x1 == 0] <- with(mydf[mydf$x1 == 0, ], x3 + x4 + x5)
newvar
```

```{r}
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3) # here different lengths !
```

```{r}
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
quantile(a, seq(0, 1, by = 0.1))
```

```{r}
summary(as.logical(rbinom(1000, 1, 0.5)))
```

```{r}
summary(factor(a)) # for factor, returns all value
```

```{r}
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
```

```{r}
table(a)
```

```{r}
prop.table(table(a)) # to obtain percentages
prop.table(table(a)) *100
```

```{r}
cbind(table(a), prop.table(table(a))*100)
```

```{r}
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
```

```{r}
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
```

```{r}
ftable(a, c, c) # provides more compact format
```

```{r}
xtabs(~a + b) # right hand formulas same as table
```

```{r}
x <- table(a, b)
addmargins(x)
```

```{r}
prop.table(table(a, b), 1) # proportions by rows
prop.table(table(a, b), 2)
```

```{r}
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
```

```{r}
cor(x1, x2)
cor.test(x1, x2)
```

```{r}
cor(cbind(x1, x2, y)) # input is matrix for correlation matrix
```

```{r}
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)
```

```{r}
plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)
```

```{r}
cor(a, b)
cor(a^2, b)
```

```{r}
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")
```

```{r}
layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)
```

```{r}
# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
```

```{r}
signif(mean(height), 4)
round(mean(height), 1)
round(mean(height), -2)
```

```{r}
options(digits = 5)
sd(height)
options(digits = 2)
sd(height)
```

```{r}
options(scipen = -10) #positive value to get fixed notation
10000000
```

```{r}
# sprintf
sprintf("%.3f", pi)
sprintf("%05.1f", pi)
```

```{r}
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")
```

```{r}
density(a)
plot(density(a))
```

```{r}
hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)
```

```{r}
b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)
```

```{r}
d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
barplot(d, names.arg = letters[1:3])
```

```{r}
barplot(d, beside = T)
```

```{r}
layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))
```

```{r}
boxplot(a)
```

```{r}
e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)
```

```{r}
g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)
```

```{r}
# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
```

```{r}
plot(x1, x2)
plot(x2~x1)
```

```{r}
layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)
```

```{r}
pairs(~x1 + x2 + x3 + x4)
```

```{r}
colors()[1:10]
length(colors())
colors()[600]
```

```{r}
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))
```

```{r}
c("red", "blue", "green", "orange")[z]
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups
```

```{r}
# Analysis of variance 

set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
```

```{r}
aov(y~tr)
```

```{r}
summary(aov(y ~ factor(tr)))
```

```{r}
oneway.test(y ~ tr)
```

```{r}
oneway.test(y ~ factor(tr), var.equal = TRUE)
```

```{r}
by(y, tr, FUN = mean)
```

```{r}
tapply(y, tr, FUN = mean) # same thing
```

```{r}
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0)  # density
dnorm(0, mean=-1)
```

```{r}
pnorm(0)  # cumulative
pnorm(1.65) # 95% normal confidence interval
pnorm(1.96)
pnorm(1.96) - pnorm(-1.96)
```

```{r}
qnorm(c(0.025, 0.975))  # quantile
pnorm(qnorm(c(0.025, 0.975)))
```

```{r}
# other distribution
dbinom(0, 1, 0.5)
pbinom(0, 1, 0.5)
qbinom(.95, 1, 0.5) # because binomial discrete distribution
```

```{r}
# Formulae
myformula <- ~x
class(myformula)
```

```{r}
# interactions
y ~ x1 * x2
y ~ x1:x2 # without the variables themselves 
y ~ -1 + x1 * x2 # drop the intercept
y ~ x + I(x^2) # without I() R thinks x^2 is a duplicate
```

```{r}
# As strings
("y ~ x") == (y ~ x)
as.formula("y~x")
as.character(y ~ x) # Formula indexed with operand first
```

```{r}
terms(y ~ x1 + x2)
```

```{r}
update(y ~ x, ~. + x2)
update(y ~ x, z ~ .)
```

```{r}
# Bivariate Regression

set.seed(1)
bin <- rbinom(1000, 1, 0.5)

out <- 2 * bin + rnorm(1000)

by(out, bin, mean)
```

```{r}
t.test(out ~ bin)
```

```{r}
lm(out ~ bin) # slope = mean difference
```

```{r}
summary(lm(out~bin))$coef
```

```{r}
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")
```

```{r}
set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
```

```{r}
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x) # more outcomes of 1 as x -> 1
```

```{r}
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color (for points)
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)
# here linear doesn't work
```

```{r}
m1 <- glm(y ~ x, family = binomial(link = "logit"))
```

```{r}
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
```

```{r}
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
```

```{r}
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit)  # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit)  # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
```

```{r}
newdf[1:10,c(1,2,3,5,4)]
```

```{r}
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))
```

```{r}
library(ggplot2)
# http://ggplot2.tidyverse.org/reference/

# basic R's plot
plot(iris$Sepal.Width, iris$Sepal.Length)
```

```{r}
head(mpg)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class)) # adding a dimension
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl) # 2D grid, rows ~ cols
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .) 
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)
# facet_wrap wraps a 1d sequence of panels into 2d. This is generally a better use of screen space than facet_grid because most displays are roughly rectangular.
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) +  geom_point() + geom_smooth()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE)  # Turn off confidence band
```

```{r}
ggplot(data = mpg, aes(x = class, y = hwy)) + geom_boxplot() # scatterplot
ggplot(data = mpg, aes(x = reorder(class, hwy), y = hwy)) + geom_boxplot() # reorder (mean)
ggplot(data = mpg, aes(x = reorder(class, hwy, median), y = hwy)) + geom_boxplot() # reorder by median
```
```{r}
# jitter
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point(position = "jitter")
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_jitter() # identical option
```
```{r}
names(diamonds) # part of ggplot2 package
ggplot(data = diamonds,aes(x =cut)) + geom_bar(aes(fill =cut)) # fill: color inside of bars
ggplot(data = diamonds, aes(x =cut)) + geom_bar(aes(color =cut)) # color: line around the bars
ggplot(data = diamonds, aes(x = color)) + geom_bar(aes(fill = cut), position = "dodge")
```



```{r}
str(diamonds)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 1)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram(binwidth = 0.01)
ggplot(data = diamonds, aes(x = carat)) + geom_histogram() #stat_bin: binwidth defaulted to range/30.
```

```{r}
zoom <- coord_cartesian(xlim = c(55, 70))
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(binwidth = 0.2) + zoom
```

```{r}
ggplot(data = diamonds, aes(x = depth)) + geom_histogram(aes(fill = cut), binwidth = 0.1) + zoom
```

```{r}
# to compare variables
ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(fill = cut)) + zoom
ggplot(data = diamonds, aes(x = depth)) + geom_density(aes(color = cut, fill = cut, alpha=0.1)) + zoom
```

```{r}
ggplot(data = diamonds, aes(x = price)) +geom_histogram(aes(fill = cut), binwidth = 100)
ggplot(data = diamonds, aes(x = price)) + geom_density(aes(color= cut)) # better
```

```{r}
# Visualization of big data
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(aes(color = cut)) # not helpful
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_bin2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_density2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point() + geom_density2d()
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(group = cut))
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_smooth(aes(color = cut), method = "loess", se=F)
```

```{r}
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(size = 0.5, alpha = 0.1)
```

```{r}
# ggsave("my-plot.pdf") # PDF more crisp
# ggsave("my-plot.png")
# ggsave("my-plot.pdf", width = 6, height = 6)
# ggsave("my-plot.png", width = 6, height = 6)
```

```{r}
# More plot examples
bnames = read.csv("bnames.csv", stringsAsFactors = FALSE) 
births = read.csv("births.csv", stringsAsFactors = FALSE)
head(births, 3)
head(bnames, 3)
```

```{r}
# Simple plot
Quentin <- bnames[bnames$name == "Letitia", ] 
qplot(year, prop, data = Quentin, geom = "line")
```

```{r}
# Interactions
michaels <- bnames[bnames$name == "Quentin" | bnames$name == "Alexis" | bnames$name == "Gina", ]
qplot(year, prop, data = michaels, geom = "line", color = interaction(sex, name))
```

```{r}
# dplyr
library(dplyr)
bnames = tbl_df(bnames) 
births = tbl_df(births) 
class(bnames)
```

```{r}
# filter
filter(bnames, sex == "girl" & (year == 1900 | year == 2000))
```

```{r}
# Select columns
select(bnames, starts_with("sound"))
# Rename column
rename(iris, petal_length = Petal.Length)
```

```{r}
# sort
arrange(bnames, desc(prop))[3,]
# year where Quentin was most popular
arrange(filter(bnames, name == "Quentin"), desc(prop))[1,]
```

```{r}
# add columns
mutate(births, double = 2 * births)
# transmute to delete old columns
```

```{r}
# Summarize
summarise(vivian,min = min(prop),mean = mean(prop), max = max(prop), number = n(), number_distinct = n_distinct(prop))
```

```{r}

bnames2 = left_join(bnames, births, by = c("year","sex"))
bnames2 = mutate(bnames2, n = round(prop * births))
# Group
by_name = group_by(bnames2, name)
by_name
totals = summarise(by_name, total = sum(n)) 
totals
```

```{r}
bnames2
name_sex = group_by(bnames2, name, year) 
totals2 = summarise(name_sex, total = sum(n)) 
head(totals2)
```

```{r}
arrange(bnames2, name)[1:3,]
```

```{r}
ungroup(by_name)
```

```{r}
# other example
year_sex = group_by(bnames2, year, sex) 
ytotals = summarise(year_sex, births = sum(n)) 
ytotals
```

```{r}
bnames2
```

```{r}
# ISLR 
# Chap 6 Ridge regression / Lasso
# Ex 8
set.seed(1)
X = rnorm(100)
eps = rnorm(100)

beta0 = 3
beta1 = 2
beta2 = -3
beta3 = 0.3
Y = beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + eps

# Use regsubsets to select best model having polynomial of XX of degree 10
library(leaps)
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)

# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
which.min(mod.summary$bic)
which.max(mod.summary$adjr2)

# Plot cp, BIC and adjr2
plot(mod.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(3, mod.summary$cp[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(3, mod.summary$bic[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, 
    type = "l")
points(3, mod.summary$adjr2[3], pch = 4, col = "red", lwd = 7)

# Coefs found by regression (replaces x^3 by x^7)
coefficients(mod.full, id = 3)

```

```{r}
# Now lasso
library(glmnet)
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1] # remove intercept (first column)
mod.lasso = cv.glmnet(xmat, Y, alpha = 1) # cv.glmnet: cross validation to select best lambda
best.lambda = mod.lasso$lambda.min
best.lambda
plot(mod.lasso)

# Next fit the model on entire data using best lambda
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
#Lasso also picks X^5 over X^3. It also picks X^7 with negligible coefficient.

```

```{r}
# ISLR 
# Chap 7 Non-linear Modeling - Splines, GAM
library(ISLR)
attach(Wage)
```

```{r}
fit=lm(wage~poly(age,4,raw=T),data=Wage)  # raw=T to obtain coefficients of poly directly
coef(summary(fit)) # below we see small coef (and p-value) for order 3 & 4

# Equivalent expressions
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage) # cbind in formulas <-> I() wrapper
coef(fit2b)
```

```{r}
# Prediction
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE) # creates list of ages where we want prediction (see below)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0)) # mar/ oma: margins of plot
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
```

```{r}
age.grid=seq(from=agelims[1],to=agelims[2]) # Or: age.grid2=sort(unique(age))
list(age=age.grid) # $age list

predict(fit,list(age=c(70))) # test for 1 age - still need to create an $age list
```

```{r}
preds
```

```{r}
# Use of ANOVA = Analysis of variance. To determined which degree is needed
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5) 
# Or obtain these p-values by exploiting the fact that poly() creates orthogonal polynomials:
coef(summary(fit.5)) # and check P-values. Each p-value corresponds to comparison from model w previous one
```

```{r}
# Generalized linear function
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial) # plus create binary response on the fly with I()
preds=predict(fit,newdata=list(age=age.grid),se=T) # default: type="link" = predictions for logit
# other option type="response" (probability vs odds)
```

```{r}
# Step function
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage) # cut() returns an ordered categorical variable. Breaks= can be used 
coef(summary(fit))

```

```{r}
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3) # Anova can compare not orthogonal polynomials, and that have other terms
```

```{r}
# Splines
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage) # 3 knots --> 7 degrees of freedom --> 1 intercept + 6 basis functions
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
```

```{r}
pred=predict(fit,se=T)
plot(age,wage,col="gray")
lines(age,pred$fit,lwd=2) # too many points: that's why we changed age into a list of all unique ages
```

```{r}
# That works too - just need to have same length of data for x and y in lines(x, y...)
# = bewtween age.grid2 and pred2$fit
age.grid2=sort(unique(age))
pred2=predict(fit,newdata=list(age=age.grid2),se=T)
plot(age,wage,col="gray")
lines(age.grid2,pred2$fit,lwd=2)
```

```{r}
str(pred2)
```

```{r}
dim(bs(age,knots=c(25,40,60), degree = 3)) # bs generates matrix with 6 basis functions (see below)
dim(bs(age,df=6)) # df option to produce a spline with knots at uniform quantiles of the data
attr(bs(age,df=6),"knots")
```

```{r}
# Natural spline: ns()
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T) #list or data.frame for predit(<class lm>, ..)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
lines(age.grid, pred2$fit,col="red",lwd=2)
```

```{r}
# Smooth spline
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16) # forces 16 degrees of freedom
fit2=smooth.spline(age,wage,cv=TRUE) # choose value of lambda by cross validation --> 6.8 degrees of freedom
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
```

```{r}
# Local regresssion (Loess)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage) # span=0.2 means use 20% of observations
fit2=loess(wage~age,span=.5,data=Wage) # 50% of observations --> smoother
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2) # predict needs data.frame for loess fit
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

```{r}

```

